pacman::p_load(ggplot2, dplyr, party, caret, corrplot)
products2017 <- read.csv("~/Documents/@/UBIQUM/DATAML/Week6/Profitability/Profitability/DATASETS/existingproductattributes2017.csv")
products2018 <- read.csv("~/Documents/@/UBIQUM/DATAML/Week6/Profitability/Profitability/DATASETS/existingproductattributesPast.csv")
#Remove Na
# ncol(is.na(products2017))
# summary(is.na(products2017))
# na.omit(products2017)
# summary(products2017)
# complete.cases(products2017)
# str(products2017)
# products2017 <- products2017[complete.cases(products2017),]
# str(products2017)
#Remove the column with Na
products2017$BestSellersRank <- NULL
#Find patterns in the data
#Select only numeric values Just for the PLOTS and the CORRELATION MATRIX V.1
products2017Num <- products2017[c(-1,-2)]
str(products2017Num)
## 'data.frame': 80 obs. of 15 variables:
## $ Price : num 949 2250 399 410 1080 ...
## $ x5StarReviews : int 3 2 3 49 58 83 11 33 16 10 ...
## $ x4StarReviews : int 3 1 0 19 31 30 3 19 9 1 ...
## $ x3StarReviews : int 2 0 0 8 11 10 0 12 2 1 ...
## $ x2StarReviews : int 0 0 0 3 7 9 0 5 0 0 ...
## $ x1StarReviews : int 0 0 0 9 36 40 1 9 2 0 ...
## $ PositiveServiceReview: int 2 1 1 7 7 12 3 5 2 2 ...
## $ NegativeServiceReview: int 0 0 0 8 20 5 0 3 1 0 ...
## $ Recommendproduct : num 0.9 0.9 0.9 0.8 0.7 0.3 0.9 0.7 0.8 0.9 ...
## $ ShippingWeight : num 25.8 50 17.4 5.7 7 1.6 7.3 12 1.8 0.75 ...
## $ ProductDepth : num 23.9 35 10.5 15 12.9 ...
## $ ProductWidth : num 6.62 31.75 8.3 9.9 0.3 ...
## $ ProductHeight : num 16.9 19 10.2 1.3 8.9 ...
## $ ProfitMargin : num 0.15 0.25 0.08 0.08 0.09 0.05 0.05 0.05 0.05 0.05 ...
## $ Volume : int 12 8 12 196 232 332 44 132 64 40 ...
# Visualize the data. First approach
# for (col in 1:ncol(products2017Num)) {
# hist(products2017Num[,col], main = names(products2017Num[col]), xlab=colnames(products2017Num[col]), border = "black", col = "#bcd4e6", breaks=50 )}
#
# for (col in 1:ncol(products2017Num)) {
# boxplot(products2017Num[,col], main = names(products2017Num[col]), xlab=colnames(products2017Num[col]), border = "black", col = "#bcd4e6" )}
#Decision Tree 1
products2017_ctree <- ctree(Volume ~ ., data=products2017, controls = ctree_control(maxdepth = 4))
plot(products2017_ctree)

#Correlation Matrix
corrData <- cor(products2017Num)
corrplot(corrData)

#Select variables. A. Without 5StarReviews
products2017CombA <- products2017[c(-2, -4)]
#Decision Tree. A
products2017_ctreeA <- ctree(Volume ~ ., data=products2017CombA, controls = ctree_control(maxdepth = 4))
plot(products2017_ctreeA)

#Positive and 4StarReviews are the most important variables.
#FEATURE ENGINEERING
#Join the Stars Attributes because they are very correlated
products2017CombMerg <- products2017
products2017CombMerg <- products2017[c(-4:-8)]
products2017CombMerg$GoodReviews <- products2017$x4StarReviews + products2017$x3StarReviews
products2017CombMerg$BadReviews <- products2017$x2StarReviews + products2017$x1StarReviews
#Decision Tree B
products2017_ctreeB <- ctree(Volume ~ ., data=products2017CombMerg, controls = ctree_control(maxdepth = 10))
plot(products2017_ctreeB)

#Select variables:: Feature Eng. B.2 Without 5StarReviews, Without Categorical Variables
products2017CombMerg2 <- products2017CombMerg[c(-1, -2)]
#Correlation Matrix B
corrDataMerg2 <- cor(products2017CombMerg2)
corrplot(corrDataMerg2)

# Visualize the data. B
for (col in 1:ncol(products2017CombMerg2)) {
hist(products2017CombMerg2[,col], main = names(products2017CombMerg2[col]), xlab=colnames(products2017CombMerg2[col]), border = "black", col = "#bcd4e6", breaks=200 )}












#There are some outliers in PositiveServiceReview and in the GoodReviews and BadReviews.
#So let's analyze how is their distribution
GoodReviewsNums <- products2017CombMerg %>% group_by(GoodReviews) %>% summarise (n = n())
GoodReviewsOutliers <- products2017CombMerg %>% filter(GoodReviews == '35')
#Products from 134 to 141 are duplicated (the only difference between them is the price)
#So let's remove them
#Select Rows:: Feature Eng. C Without duplicated rows
products2017CombMergClean <- products2017CombMerg[-c(34:41),]
#Select Rows:: Feature Eng. D All variables withour duplicated rows
products2017Clean <- products2017[-c(34:41),]
#Decision Tree C
products2017_ctreeC <- ctree(Volume ~ ., data=products2017CombMergClean, controls = ctree_control(maxdepth = 10))
plot(products2017_ctreeC)

#Creating dummy variables------
#Select variables:: Feature Eng. C.2 Without Categorical Variables
products2017CombMergClean2 <- products2017CombMergClean[c(-1, -2)]
# Visualize the data. C
for (col in 1:ncol(products2017CombMergClean2)) {
hist(products2017CombMergClean2[,col], main = names(products2017CombMergClean2[col]), xlab=colnames(products2017CombMergClean2[col]), border = "black", col = "#bcd4e6", breaks=200 )}












ggplot(products2017CombMergClean, aes(x=ProductType, fill=Volume)) + geom_bar()

#We can see that there are still many outliers.
#Detect outliers from the Standard Deviation
findOutlier <- function(products2017Clean, cutoff = 3) {
## Calculate the sd
sdproducts2017Clean <- apply(products2017Clean, 2, sd, na.rm = TRUE)
## Identify the cells with value greater than cutoff * sd (column wise)
result <- mapply(function(d, s) {
which(d > cutoff * s)
}, products2017Clean, sdproducts2017Clean)
result
}
outliers <- findOutlier(products2017Clean)
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm
## = na.rm): NAs introduced by coercion
## Warning in Ops.factor(d, cutoff * s): '>' not meaningful for factors
outliers
## $ProductType
## integer(0)
##
## $ProductNum
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
## [70] 70 71 72
##
## $Price
## [1] 2 5 49
##
## $x5StarReviews
## [1] 42 65
##
## $x4StarReviews
## [1] 40 42 65
##
## $x3StarReviews
## [1] 23 40 42 65
##
## $x2StarReviews
## [1] 23 42
##
## $x1StarReviews
## [1] 23
##
## $PositiveServiceReview
## [1] 18 42
##
## $NegativeServiceReview
## [1] 23 59
##
## $Recommendproduct
## [1] 1 2 3 4 5 7 8 9 10 13 14 15 16 17 18 19 20 21 22 24 25 26 28
## [24] 30 31 35 36 37 38 39 40 42 43 44 45 46 47 48 49 51 52 53 55 58 59 60
## [47] 61 63 65 66 67 68 69 71 72
##
## $ShippingWeight
## [1] 2 27 28 60
##
## $ProductDepth
## [1] 45
##
## $ProductWidth
## [1] 2 26 28 40 58
##
## $ProductHeight
## [1] 28 50
##
## $ProfitMargin
## [1] 25 29 30 31 32 33
##
## $Volume
## [1] 42 65
removeOutlier <- function(products2017Clean, outliers) {
result <- mapply(function(d, o) {
res <- d
res[o] <- NA
return(res)
}, products2017Clean, outliers)
return(as.data.frame(result))
}
#adding again the product numbers
#products2017CleanFilter$ProductNum <- products2017Clean$ProductNum
#adding NA to the outliers
products2017CleanFilter <- removeOutlier(products2017Clean, outliers)
products2017CleanFilter$NumOutliers <- apply(products2017CleanFilter, 1, function(x) sum(is.na(x))-1)
nrow(is.na(products2017CleanFilter))
## [1] 72
hist(products2017CleanFilter$NumOutliers)

hist(products2017CleanFilter$Price)

hist(products2017Clean$Price)

products2017Clean %>% filter(Price > 1000)
## ProductType ProductNum Price x5StarReviews x4StarReviews x3StarReviews
## 1 PC 102 2249.99 2 1 0
## 2 Laptop 105 1079.99 58 31 11
## 3 Display 157 1276.57 164 33 6
## x2StarReviews x1StarReviews PositiveServiceReview NegativeServiceReview
## 1 0 0 1 0
## 2 7 36 7 20
## 3 13 6 12 4
## Recommendproduct ShippingWeight ProductDepth ProductWidth ProductHeight
## 1 0.9 50 35.0 31.75 19.0
## 2 0.7 7 12.9 0.30 8.9
## 3 0.9 23 15.5 17.70 5.7
## ProfitMargin Volume
## 1 0.25 8
## 2 0.09 232
## 3 0.25 656
#RecommendProduct results had many outliers. Why?
plot(products2017$Recommendproduct)

hist(products2017$Recommendproduct)

RecommendNums <- products2017Clean %>% group_by(Recommendproduct) %>% summarise (n = n())
RecommendproductOutliers <- products2017Clean %>% filter(Recommendproduct > 0.6) #Product 150, 198
#See the rows with more outliers and compare it with the table with the NA's
GeneralOutliers <- products2017Clean %>% filter(ProductNum == '198'| ProductNum == '150'| ProductNum == '102'| ProductNum == '123'| ProductNum == '128'| ProductNum == '148' )
#Conlusion about Outliers: 1. Products 123, 150, 198 should be removed because the have many outliers. 2. The attribute Recommendproduct has many otliers and should be removed. 3. The products with high price wont be deleted yet, first let's compare the models with and without them.
#Remove rows with outliers
OutlierstoRemove <- c(198,150,123)
products2017Clean2 <- products2017Clean %>% filter(!ProductNum %in% OutlierstoRemove )
products2017Clean2
## ProductType ProductNum Price x5StarReviews x4StarReviews
## 1 PC 101 949.00 3 3
## 2 PC 102 2249.99 2 1
## 3 PC 103 399.00 3 0
## 4 Laptop 104 409.99 49 19
## 5 Laptop 105 1079.99 58 31
## 6 Accessories 106 114.22 83 30
## 7 Accessories 107 379.99 11 3
## 8 Accessories 108 65.29 33 19
## 9 Accessories 109 119.99 16 9
## 10 Accessories 110 16.99 10 1
## 11 Accessories 111 6.55 21 2
## 12 Accessories 112 15.00 75 25
## 13 Accessories 113 52.50 10 8
## 14 Accessories 114 21.08 313 62
## 15 Accessories 115 18.98 349 118
## 16 Accessories 116 3.60 8 6
## 17 Accessories 117 3.60 11 5
## 18 Accessories 118 174.99 170 100
## 19 Accessories 119 9.99 15 12
## 20 Software 120 670.00 20 2
## 21 Software 121 133.08 34 15
## 22 Software 122 124.99 394 187
## 23 Software 124 48.50 29 18
## 24 Software 125 31.99 77 31
## 25 Display 126 179.99 306 114
## 26 Printer 127 396.35 8 0
## 27 Printer 128 262.98 22 8
## 28 PrinterSupplies 129 83.11 0 0
## 29 PrinterSupplies 130 26.78 6 2
## 30 PrinterSupplies 131 43.22 5 0
## 31 ExtendedWarranty 132 79.98 0 1
## 32 ExtendedWarranty 133 87.98 5 0
## 33 PC 142 609.99 21 7
## 34 Laptop 143 770.60 22 14
## 35 Accessories 144 3.84 92 29
## 36 Accessories 145 6.99 384 31
## 37 Accessories 146 59.99 366 59
## 38 Accessories 147 38.66 209 83
## 39 Accessories 148 10.75 535 261
## 40 Accessories 149 126.95 13 2
## 41 Accessories 151 103.85 3 0
## 42 Accessories 152 52.72 51 33
## 43 Accessories 153 19.99 474 38
## 44 Accessories 154 16.24 108 54
## 45 Accessories 155 69.00 226 37
## 46 Display 156 359.99 90 27
## 47 Display 157 1276.57 164 33
## 48 Display 158 783.98 26 13
## 49 Display 159 149.99 21 10
## 50 Printer 160 129.99 74 25
## 51 Printer 161 128.49 58 33
## 52 Printer 162 141.95 4 0
## 53 Printer 163 149.99 8 3
## 54 Printer 164 165.99 2 0
## 55 Printer 165 169.26 20 13
## 56 Printer 166 132.36 0 1
## 57 Printer 167 149.99 206 89
## 58 Printer 168 395.00 8 0
## 59 Printer 169 385.96 99 43
## 60 Netbook 177 379.99 1 0
## 61 Tablet 185 499.00 148 66
## 62 Smartphone 190 199.00 4 1
## 63 Tablet 188 499.00 86 51
## 64 Tablet 189 419.00 3 1
## 65 Smartphone 191 200.00 62 25
## 66 Smartphone 192 99.00 18 17
## 67 Netbook 182 349.99 22 10
## 68 Smartphone 197 499.00 368 28
## 69 GameConsole 200 299.99 421 87
## x3StarReviews x2StarReviews x1StarReviews PositiveServiceReview
## 1 2 0 0 2
## 2 0 0 0 1
## 3 0 0 0 1
## 4 8 3 9 7
## 5 11 7 36 7
## 6 10 9 40 12
## 7 0 0 1 3
## 8 12 5 9 5
## 9 2 0 2 2
## 10 1 0 0 2
## 11 2 4 15 2
## 12 6 3 3 9
## 13 5 0 1 2
## 14 13 8 16 44
## 15 27 7 5 57
## 16 3 2 1 0
## 17 2 2 1 0
## 18 23 20 20 310
## 19 4 0 4 3
## 20 4 2 4 4
## 21 2 2 10 5
## 22 63 42 86 55
## 23 3 1 8 4
## 24 7 3 2 7
## 25 25 22 28 42
## 26 1 0 2 1
## 27 3 1 3 5
## 28 0 1 3 1
## 29 0 0 1 1
## 30 0 0 0 1
## 31 1 1 1 0
## 32 2 0 1 0
## 33 3 0 12 5
## 34 4 5 6 6
## 35 8 4 10 12
## 36 13 5 20 50
## 37 23 26 24 60
## 38 35 38 36 15
## 39 134 104 177 120
## 40 2 2 5 5
## 41 0 0 0 1
## 42 8 7 4 6
## 43 7 6 12 80
## 44 12 2 6 7
## 45 3 7 8 13
## 46 10 4 4 7
## 47 6 13 6 12
## 48 7 5 16 4
## 49 3 1 4 4
## 50 7 6 9 4
## 51 10 3 6 5
## 52 0 0 3 0
## 53 3 2 0 0
## 54 1 1 2 1
## 55 8 6 21 4
## 56 0 0 0 0
## 57 20 22 65 42
## 58 1 0 2 3
## 59 17 11 20 8
## 60 1 1 0 0
## 61 30 20 29 12
## 62 0 2 2 1
## 63 17 12 9 14
## 64 0 0 0 0
## 65 10 11 12 9
## 66 6 2 12 5
## 67 6 2 10 3
## 68 14 10 23 22
## 69 20 14 39 29
## NegativeServiceReview Recommendproduct ShippingWeight ProductDepth
## 1 0 0.9 25.80 23.94
## 2 0 0.9 50.00 35.00
## 3 0 0.9 17.40 10.50
## 4 8 0.8 5.70 15.00
## 5 20 0.7 7.00 12.90
## 6 5 0.3 1.60 5.80
## 7 0 0.9 7.30 6.70
## 8 3 0.7 12.00 7.90
## 9 1 0.8 1.80 10.60
## 10 0 0.9 0.75 10.70
## 11 1 0.5 1.00 7.30
## 12 2 0.2 2.20 21.30
## 13 0 0.8 1.10 15.60
## 14 3 0.9 0.35 5.70
## 15 3 0.9 0.60 1.70
## 16 0 0.8 0.01 11.50
## 17 0 0.8 0.01 11.50
## 18 6 0.8 1.40 13.80
## 19 1 0.8 0.40 11.10
## 20 3 0.7 0.25 5.80
## 21 4 0.7 3.20 7.40
## 22 38 0.8 0.15 7.60
## 23 2 0.8 0.20 8.00
## 24 0 0.9 0.20 0.00
## 25 12 0.8 13.70 8.50
## 26 1 0.3 63.00 17.90
## 27 1 0.8 57.00 17.30
## 28 1 0.1 10.30 0.00
## 29 0 0.9 1.00 3.30
## 30 0 1.0 1.00 4.70
## 31 3 0.4 0.20 0.00
## 32 1 0.6 0.20 0.00
## 33 3 0.6 29.10 20.95
## 34 2 0.7 3.54 12.72
## 35 3 0.8 1.00 6.50
## 36 1 0.9 1.00 108.00
## 37 5 0.8 2.40 7.90
## 38 4 0.7 0.40 3.60
## 39 15 0.7 1.50 11.50
## 40 1 0.6 3.00 5.90
## 41 0 0.9 8.00 4.80
## 42 1 0.8 4.00 6.60
## 43 2 0.9 1.50 300.00
## 44 3 0.9 0.55 6.00
## 45 1 0.9 3.80 15.00
## 46 3 0.9 7.00 9.20
## 47 4 0.9 23.00 15.50
## 48 5 0.6 25.00 29.20
## 49 2 0.8 10.00 20.00
## 50 2 0.9 32.20 15.70
## 51 2 0.9 22.70 15.70
## 52 1 0.5 25.00 19.50
## 53 0 0.7 35.00 10.20
## 54 1 0.5 31.00 22.10
## 55 7 0.5 32.00 15.10
## 56 0 0.8 30.20 20.90
## 57 50 0.7 13.00 8.80
## 58 0 0.8 63.00 17.90
## 59 13 0.7 39.00 21.00
## 60 1 0.3 3.00 7.44
## 61 6 0.8 2.20 7.10
## 62 1 0.5 1.10 4.50
## 63 2 0.8 2.00 10.10
## 64 0 0.9 2.20 7.00
## 65 3 0.8 0.90 2.80
## 66 4 0.7 0.70 2.80
## 67 3 0.3 5.00 7.57
## 68 3 0.9 0.90 2.70
## 69 14 0.9 10.94 12.00
## ProductWidth ProductHeight ProfitMargin Volume
## 1 6.62 16.89 0.15 12
## 2 31.75 19.00 0.25 8
## 3 8.30 10.20 0.08 12
## 4 9.90 1.30 0.08 196
## 5 0.30 8.90 0.09 232
## 6 4.00 1.00 0.05 332
## 7 10.30 11.50 0.05 44
## 8 6.70 2.20 0.05 132
## 9 9.40 4.70 0.05 64
## 10 13.10 0.60 0.05 40
## 11 7.00 1.60 0.05 84
## 12 1.80 7.80 0.05 300
## 13 3.00 15.00 0.05 40
## 14 3.50 8.30 0.05 1252
## 15 13.50 10.20 0.05 1396
## 16 8.50 0.40 0.05 32
## 17 8.50 0.40 0.05 44
## 18 8.20 0.40 0.05 680
## 19 7.60 0.50 0.05 60
## 20 1.40 7.80 0.15 80
## 21 5.50 1.40 0.15 136
## 22 5.50 1.20 0.20 1576
## 23 7.00 1.00 0.20 116
## 24 0.00 0.00 0.30 308
## 25 22.30 17.50 0.08 1224
## 26 15.90 12.70 0.12 32
## 27 23.50 25.80 0.12 88
## 28 0.00 0.00 0.35 0
## 29 1.60 4.70 0.30 24
## 30 2.90 6.30 0.30 20
## 31 0.00 0.00 0.40 0
## 32 0.00 0.00 0.40 20
## 33 8.47 20.71 0.09 84
## 34 8.90 0.71 0.15 88
## 35 4.20 1.50 0.05 368
## 36 0.50 0.50 0.05 1536
## 37 1.40 19.10 0.05 1464
## 38 6.50 8.80 0.05 836
## 39 20.00 7.20 0.05 2140
## 40 1.80 6.70 0.05 52
## 41 1.30 4.80 0.05 12
## 42 5.50 5.20 0.05 204
## 43 0.50 0.50 0.05 1896
## 44 3.00 8.20 0.05 432
## 45 9.80 18.90 0.05 904
## 46 11.20 3.20 0.20 360
## 47 17.70 5.70 0.25 656
## 48 9.90 23.00 0.16 104
## 49 15.30 6.40 0.17 84
## 50 15.90 12.40 0.10 296
## 51 15.90 10.60 0.12 232
## 52 18.00 14.00 0.14 16
## 53 15.98 14.55 0.18 32
## 54 18.60 13.50 0.18 8
## 55 11.70 19.60 0.16 80
## 56 20.90 14.60 0.20 0
## 57 13.70 7.60 0.15 824
## 58 15.90 12.70 0.09 32
## 59 15.40 17.90 0.11 396
## 60 10.43 1.02 0.10 4
## 61 10.40 0.30 0.13 592
## 62 2.50 0.50 0.10 16
## 63 6.90 0.38 0.20 344
## 64 10.20 0.40 0.18 12
## 65 5.40 0.30 0.14 248
## 66 5.30 0.40 0.17 72
## 67 10.47 1.43 0.12 88
## 68 5.00 0.40 0.10 1472
## 69 11.50 7.25 0.12 1684
#RecommendProduct
# box_plot <- boxplot(products2017[, c("","")])
#sort(products2017Num$NegativeServiceReview)